# created by Ramin Izadi

# This program cleans the xwalk maps in such a way that each year has a complete mapping to the base years.
# This way the crosswalks can be joined directly by year and industry/subregion code.


# NOTE(review): removes every (non-hidden) object from the environment the
# script is evaluated in -- the global environment when run at top level.
# Attached packages and options are not reset by this call.
rm(list = ls())

# Package setup. stringr is also attached by tidyverse; loading it
# explicitly is harmless.
library(stringr)
library(tidyverse)
library(haven) # for read_sas; read_dta from this package is used below
library(readstata13) #for read.dta13 (no read.dta13 call is visible in this file -- TODO confirm it is still needed)
library(fuzzyjoin) # for joins by strings (no fuzzy join is visible in this file -- TODO confirm it is still needed)

# Project helper functions; their contents are not visible from this file.
source("functions.R")


# Occupation crosswalks ---------------------------------------------------


# Read the raw occupation crosswalk files and give their columns short
# working names. The suffixes 80 / 01 / 10 on the column names follow the
# classification vintages named in the input files.

# 1980 <-> 2001 classification key (Stata file with two columns).
occ_xwalk_80_01 <- setNames(
  read_dta("data_input/LUOKITUSAVAIN_AML1980_AML2001.dta"),
  c("occ80", "occ01")
)

# 2001 -> 2010 crosswalk: tab-delimited, latin1-encoded, seven columns
# (code, `taso` [Finnish: level] and label for each classification, plus
# `osuus` [Finnish: share]).
xwalk_occ_0110 <- read_delim(
  "data_input/xwalk_occ_0110.txt",
  delim = "\t",
  locale = locale(encoding = "latin1"),
  col_types = "ciccici"
) %>%
  setNames(c(
    "occ01", "taso01", "occ_label01",
    "occ10", "taso10", "occ_label10",
    "osuus"
  ))

# 2010 -> 2001 crosswalk: same file layout with the two classifications in
# the opposite column order.
xwalk_occ_1001 <- read_delim(
  "data_input/xwalk_occ_1001.txt",
  delim = "\t",
  locale = locale(encoding = "latin1"),
  col_types = "ciccici"
) %>%
  setNames(c(
    "occ10", "taso10", "occ_label10",
    "occ01", "taso01", "occ_label01",
    "osuus"
  ))

# The commented-out code below generates the raw file (xwalk_occupations_2010_to_manual_raw.csv),
# which was then cleaned by hand to produce xwalk_occupations_2010_to_manual.csv.
# occs = xwalk_occ_1001 %>% select(taso10, occ_label10, occ10) %>%
#   distinct(occ10, .keep_all = T) %>%
#   mutate(occ_new = str_sub(occ10, 1, 3))
# write_delim(occs, "xwalk/raw/xwalk_occupations_2010_to_manual_raw.csv", delim = ",")

# Hand-cleaned mapping from 2010 occupation codes to the manual occupation
# categories (`occ_manual`). The `notes` column is dropped after reading
# because nothing below uses it.
xwalk_occ_10_manual <-
  read_csv(
    "data_input/xwalk_occupations_2010_to_manual.csv",
    col_types = "iccic"
  ) %>%
  select(-notes)


# Map every 2001 occupation code to a manual occupation category.
#
# Steps:
#   1. keep only the crosswalk rows whose `osuus` is 0 or 1 (`osuus` is read
#      as an integer column, so it is compared against integers);
#   2. keep exactly one 2010 counterpart per 2001 code: after sorting by
#      descending `taso10`, the first row of each `occ01` is retained;
#   3. strip the first "*" from the 2010 code (presumably so that it matches
#      the codes in the manual file -- TODO confirm);
#   4. attach the manual category. The join keys are spelled out so that the
#      join cannot silently pick up any other column the two tables happen
#      to share.
xwalk_occ_01_manual <- xwalk_occ_0110 %>%
  filter(osuus %in% c(0L, 1L)) %>%
  arrange(desc(taso10)) %>%
  distinct(occ01, .keep_all = TRUE) %>%
  mutate(occ10 = str_replace(occ10, "\\*", "")) %>%
  left_join(xwalk_occ_10_manual, by = c("occ10", "taso10", "occ_label10"))


# Map every 1980 occupation code to a manual category by going through the
# 2001 codes. This reuses the 2001 table instead of repeating its pipeline.
# 2001 codes without a 1980 counterpart produce NA `occ80` in the left join
# and are removed.
xwalk_occ_80_manual <- xwalk_occ_01_manual %>%
  left_join(occ_xwalk_80_01, by = "occ01") %>%
  filter(!is.na(occ80))

# Reduce each crosswalk to a two-column lookup with a common layout:
#   ammattikoodi_k  - the occupation code of that classification vintage
#   occ_manual_new  - the manual occupation category it maps to
x10 <- xwalk_occ_10_manual %>%
  select(occ10, occ_manual) %>%
  rename(ammattikoodi_k = occ10, occ_manual_new = occ_manual)

x01 <- xwalk_occ_01_manual %>%
  select(occ01, occ_manual) %>%
  rename(ammattikoodi_k = occ01, occ_manual_new = occ_manual)

x80 <- xwalk_occ_80_manual %>%
  select(occ80, occ_manual) %>%
  rename(ammattikoodi_k = occ80, occ_manual_new = occ_manual)
  

# Stack the per-classification lookups into one table keyed by year
# (`vuosi`) and occupation code, with one full copy of the applicable lookup
# for every year:
#   1990 - 1994: 1980 codes (x80)
#   1995 - 2009: 2001 codes (x01)
#   2010 - 2018: 2010 codes (x10)
# The year ranges are written out explicitly, so the number of copies and the
# year labels cannot drift apart.
xwalk_occupation_manual <- local({
  # One copy of `lookup` per year, labelled with that year in `vuosi`.
  repeat_for_years <- function(lookup, years) {
    copies <- setNames(rep(list(lookup), length(years)), years)
    bind_rows(copies, .id = "vuosi")
  }

  bind_rows(
    repeat_for_years(x80, 1990:1994),
    repeat_for_years(x01, 1995:2009),
    repeat_for_years(x10, 2010:2018)
  ) %>%
    # `.id` yields the year as text; store it as a number.
    mutate(vuosi = as.numeric(vuosi))
})

save(xwalk_occupation_manual, file = "data_output/xwalk_occupation_manual.Rdata")

# Sanity check: every manual category used in the 2010 lookup should also be
# reached by at least one 2001 code. Rows of the 2010 manual file whose
# `occ_manual` never appears in the 2001 mapping are listed.
# The result is printed and reported explicitly because a bare top-level
# expression is not shown when the script is run through source().
local({
  unreached <- xwalk_occ_10_manual %>%
    filter(!occ_manual %in% xwalk_occ_01_manual$occ_manual)

  if (nrow(unreached) > 0) {
    print(unreached)
    warning(
      nrow(unreached),
      " row(s) of xwalk_occ_10_manual have an occ_manual category that no ",
      "2001 occupation code maps to.",
      call. = FALSE
    )
  } else {
    message("All occ_manual categories are reached from the 2001 codes.")
  }

  invisible(unreached)
})

